import numpy as np
import pandas as pd
import seaborn as sns
import plotly.io as pio
import plotly.express as px
import tensorflow as tf
import datetime, os, pickle
from sklearn.model_selection import train_test_split
# Log the TensorFlow version for reproducibility (the tensorflow.keras imports
# below assume a TF 2.x install — confirm if rerunning on a new environment).
print(tf.__version__)
from tensorflow.keras.layers import Input, Dense, Embedding, Flatten, Concatenate, BatchNormalization, Reshape, Dropout, Activation
from tensorflow.keras.optimizers import *
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical, normalize, plot_model
# Load the pitch-by-pitch dataset prepared in pitchnet_data_prep_spark.ipynb.
baseball = pd.read_parquet('pitch_prediction_data.parquet')
# (rows, columns) of the loaded frame — displayed by the notebook.
baseball.shape
The raw data is scraped from the baseball website Savant and includes all pitch-by-pitch data from 2014 to 2019. The data-cleaning step (using PySpark) is in the pitchnet_data_prep_spark.ipynb file. I have also simplified the raw data by keeping the 9 most common pitch types. The glossary of MLB pitch types can be found here.
There are 3 continuous predictors we will feed into the classification neural network: the release speed, horizontal break, and vertical break.
The target variable is pitch_type.
# Peek at the raw rows, then report the cardinality of the two key columns.
baseball.head()
unique_pitchers = np.unique(baseball.pitcher)
n_pitcher = unique_pitchers.shape[0]
print('Number of pitchers: {}'.format(n_pitcher))
unique_pitch_types = np.unique(baseball.pitch_type)
n_pitch_type = unique_pitch_types.shape[0]
print('Number of pitch types: {}'.format(n_pitch_type))
print('Included pitch types: {}'.format(unique_pitch_types))
# Default figure size for the seaborn plots below.
sns.set(rc={'figure.figsize': (10, 6.5)})
# Plot on a 30% subsample to keep the distribution plots fast; the seed makes
# the sample reproducible.
samp = baseball.sample(frac=0.3, random_state=1)
Some pitches have very similar distributions of release speed, e.g. FF (four-seam fastball), FT (two-seam fastball), and SI (sinker).
# Release-speed distributions per pitch type; outliers hidden for readability.
sns.boxenplot(x='pitch_type', y='release_speed', data=samp, showfliers=False);
FF (four-seam fastball) seems to be distinguishable from other fastballs if we look at the vertical break.
# Vertical-break distributions per pitch type; outliers hidden for readability.
sns.boxenplot(x='pitch_type', y='vertical_break', data=samp, showfliers=False);
The horizontal break would have opposite signs for left-handed and right-handed pitchers:
# Horizontal break split by throwing hand — the sign flips between L and R.
sns.boxenplot(x='pitch_type', y='horizontal_break', hue='p_throws', data=samp, showfliers=False);
Here I computed the average release speed, vertical break, and horizontal break for each pitch type of each pitcher, and plotted them in a 3-D scatter plot. The points are colored by pitch type.
# Per-pitcher, per-pitch-type means of the three continuous features.
arsenal = (
    baseball
    .groupby(['player_name', 'pitch_type', 'p_throws'])
    .agg({'vertical_break': 'mean', 'horizontal_break': 'mean', 'release_speed': 'mean'})
    .reset_index()
)
# Interactive 3-D scatter: one point per (pitcher, pitch type), colored by type.
fig = px.scatter_3d(
    arsenal,
    x='horizontal_break',
    y='vertical_break',
    z='release_speed',
    color='pitch_type',
    symbol='p_throws',
    hover_name='player_name',
    color_discrete_sequence=px.colors.qualitative.Alphabet,
)
fig.update_traces(marker=dict(size=2))
fig.show()
# pio.write_html(fig, file='arsenal.html')
Different pitchers can have very different pitching profiles, which can cause problems when classifying pitches. The density plot of release speed below demonstrates this issue. For example, the pitcher Ryan Yarbrough throws a cutter (FC) around 85 mph, while the pitcher Tyler Glasnow throws his slider (SL) in a similar range. Having an embedding layer allows us to encode pitcher information into our network.
# Compare release-speed densities for one left- and one right-handed pitcher.
comparison = baseball[baseball.player_name.isin(['Ryan Yarbrough', 'Tyler Glasnow'])]
# Fixed color per pitch type so both facets share a palette.
pitch_palette = {ptype: 'C{}'.format(idx)
                 for idx, ptype in enumerate(['CH', 'CU', 'FS', 'KC', 'SL', 'SI', 'FF', 'FC', 'FT'])}
g = sns.FacetGrid(comparison, col='player_name', hue='pitch_type', palette=pitch_palette, height=5)
g = (g.map(sns.distplot, 'release_speed', hist=False, kde_kws={"shade": True}, vertical=True)
     .set_axis_labels("density (by pitch_type)", "release speed"))
for ax in g.axes.ravel():
    ax.legend()
g.fig.suptitle('Release Speed of different pitch types', y=1.05);
The density plot below shows the horizontal break of different pitch types. Again, the plot shows that it is hard to distinguish between Yarbrough's cutter (FC) and Glasnow's slider (SL). Note that since Glasnow is right-handed and Yarbrough is left-handed, the signs of horizontal break (for the same pitch type) are likely to be opposite.
# Same two-pitcher comparison as above, now for horizontal break.
comparison = baseball[baseball.player_name.isin(['Ryan Yarbrough', 'Tyler Glasnow'])]
# Fixed color per pitch type so both facets share a palette.
pitch_palette = {ptype: 'C{}'.format(idx)
                 for idx, ptype in enumerate(['CH', 'CU', 'FS', 'KC', 'SL', 'SI', 'FF', 'FC', 'FT'])}
g = sns.FacetGrid(comparison, col='player_name', hue='pitch_type', palette=pitch_palette, height=5)
g = (g.map(sns.distplot, 'horizontal_break', hist=False, kde_kws={"shade": True}, vertical=True)
     .set_axis_labels("density (by pitch_type)", "horizontal break"))
for ax in g.axes.ravel():
    ax.legend()
g.fig.suptitle('Horizontal Break of different pitch types', y=1.05);
Here we define the model using the functional API. The structure is pretty simple: first we normalize the continuous inputs by applying a batchnorm layer, then we concatenate it with the pitcher embedding layer. After the concatenation, we add dense layers with relu activation followed by batchnorm layers. The last layer is a dense layer with softmax activation.
def make_PitchNet(n_pitch_type, n_pitcher, continuous_cols, embedding_dim, hidden_layer_dim):
    """Build the PitchNet classifier with the Keras functional API.

    Architecture: the continuous inputs are stacked and batch-normalized,
    concatenated with a learned pitcher embedding, passed through
    Dense(relu) + BatchNorm hidden layers, and classified with a softmax head.

    Args:
        n_pitch_type: number of target classes (width of the softmax output).
        n_pitcher: number of distinct pitcher codes seen in training; the
            embedding table has n_pitcher + 1 rows so one spare row can
            absorb a pitcher id that was not in the training set.
        continuous_cols: names of the continuous feature columns; each gets
            its own scalar Input. NOTE(review): Concatenate requires at
            least two tensors, so this must contain two or more columns.
        embedding_dim: dimensionality of the pitcher embedding.
        hidden_layer_dim: iterable of hidden-layer widths, applied in order.

    Returns:
        An uncompiled tf.keras Model whose input order is
        continuous_cols + ['pitcher'].
    """
    # One scalar input per continuous feature, plus the integer pitcher code.
    all_cols = continuous_cols + ['pitcher']
    inputs = {col: Input(shape=(1,), name=col) for col in all_cols}
    # Stack the continuous features into shape (batch, 1, n_features) and
    # batch-normalize them, so the raw features need no manual scaling.
    continuous_bn = Concatenate()(
        [Reshape((1, 1), name=f'reshape_{col}')(inputs[col]) for col in continuous_cols])
    continuous_bn = BatchNormalization()(continuous_bn)
    # Pitcher embedding; row n_pitcher is reserved for unseen pitchers.
    embeddings = Embedding(input_dim=n_pitcher + 1,
                           output_dim=embedding_dim,
                           input_length=1,
                           name='emb_pitcher')(inputs['pitcher'])
    # Merge the embedding with the normalized continuous block, then flatten
    # (batch, 1, embedding_dim + n_features) down to 2-D for the Dense stack.
    x = Concatenate()([embeddings, continuous_bn])
    x = Flatten()(x)
    # Hidden stack: one Dense(relu) + BatchNorm pair per requested width.
    for i, dim in enumerate(hidden_layer_dim):
        x = Dense(dim, activation='relu', name=f'Dense_{i}')(x)
        x = BatchNormalization()(x)
    # Softmax over the pitch-type classes.
    pitch_type_out = Dense(n_pitch_type, activation='softmax', name='last_layer')(x)
    return Model([inputs[f] for f in all_cols], pitch_type_out, name='PitchNet')
# Hyper-parameters and model construction.
n_pitch_type = np.unique(baseball.pitch_type).shape[0]
n_pitcher = np.unique(baseball.pitcher).shape[0]
continuous_cols = ['horizontal_break', 'vertical_break', 'release_speed']
model = make_PitchNet(
    n_pitch_type,
    n_pitcher,
    continuous_cols,
    embedding_dim=50,
    hidden_layer_dim=[30, 15],
)
# Render the architecture diagram.
plot_model(model)
# Map pitcher ids to dense integer codes for the embedding layer.
pitcher_id, pitcher_mapping = baseball.pitcher.factorize()
# Integer-encode the target pitch types, then one-hot encode them.
target, target_mapping = baseball.pitch_type.factorize()
target = to_categorical(target)
# Feature matrix: the three continuous columns followed by the pitcher code.
data = pd.DataFrame(np.column_stack((baseball[continuous_cols], pitcher_id)))
# Create train, validation, test sets (80% / 10% / 10%).
# FIX(review): pass random_state explicitly so the splits are reproducible
# even if other code consumes the global NumPy RNG. Relying only on
# np.random.seed(0) is fragile; note the exact rows selected will differ
# slightly from the original seed-only version.
np.random.seed(0)
train, val = train_test_split(data, test_size=0.2, random_state=0)
val, test = train_test_split(val, test_size=0.5, random_state=0)

def _as_input_list(frame):
    # Keras functional inputs expect one 1-D array per named Input, in the
    # same column order used to build `data`.
    arr = frame.to_numpy()
    return [arr[:, i] for i in range(arr.shape[1])]

x_train = _as_input_list(train)
y_train = target[train.index]
x_val = _as_input_list(val)
y_val = target[val.index]
x_test = _as_input_list(test)
y_test = target[test.index]
x_train
y_train
print(y_train.shape)
print(y_val.shape)
print(y_test.shape)
# Compile: Adam optimizer, cross-entropy on the one-hot targets.
LEARNING_RATE = 1e-4
optimizer = Adam(learning_rate=LEARNING_RATE)
model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
Note that there are tons of parameters to be learned in the embedding layer. It grows linearly with the embedding dimension. In our case, since we have 1523 unique pitchers and 50 dimensions, we have: $(1523+1) * 50 = 76200$ parameters to learn. The $+1$ is for unseen pitcher in the training set.
model.summary()

# Callbacks: stop training once the monitored metric (val_loss by Keras
# default) fails to improve for PATIENCE consecutive epochs.
PATIENCE = 3
early_stopping_monitor = EarlyStopping(patience=PATIENCE)

# Train the model.
NUM_EPOCHS = 20
BATCH_SIZE = 2000
model.fit(
    x_train,
    y_train,
    epochs=NUM_EPOCHS,
    batch_size=BATCH_SIZE,
    validation_data=(x_val, y_val),
    callbacks=[early_stopping_monitor],
)
# Plot the training curves recorded by the last model.fit call.
from matplotlib import pyplot as plt

def _plot_history(metric, legend_loc):
    # Train/validation curves for one metric from model.history.
    history = model.history.history
    plt.plot(history[metric])
    plt.plot(history['val_' + metric])
    plt.title('model ' + metric)
    plt.ylabel(metric)
    plt.xlabel('epoch')
    plt.legend(['train', 'val'], loc=legend_loc)
    plt.show()

# Loss and accuracy over the training epochs.
_plot_history('loss', 'upper right')
_plot_history('accuracy', 'lower right')
The model achieves a 0.948 accuracy on the test set:
# test set accuracy — evaluate() returns [loss, accuracy] per the compile metrics
model.evaluate(x_test, y_test)
# Persist the trained model to the 'model' directory
model.save('model')
We are also interested in finding the best structure, i.e. the number of nodes of the hidden layers, the number of hidden layers, and the dimension of the embedding layer.
# Grid search over the embedding dimension and hidden-layer structure.
LEARNING_RATE = 1e-4
NUM_EPOCHS = 20
BATCH_SIZE = 2000
PATIENCE = 3
embedding_dim_grid = [25, 50, 75]
hidden_layer_dim_grid = [[15], [30], [30, 15]]
best_accuracy = -np.inf
best_params = None  # stays None only if one of the grids is empty
for emb_dim in embedding_dim_grid:
    for hidden_dims in hidden_layer_dim_grid:
        model = make_PitchNet(n_pitch_type,
                              n_pitcher,
                              continuous_cols,
                              embedding_dim=emb_dim,
                              hidden_layer_dim=hidden_dims)
        # FIX(review): build a fresh optimizer and callback per candidate.
        # A Keras optimizer binds per-variable slot state and a step counter
        # to the first model it trains, so sharing one instance across
        # models leaks state from earlier candidates into later ones.
        model.compile(optimizer=Adam(learning_rate=LEARNING_RATE),
                      loss='categorical_crossentropy',
                      metrics=['accuracy'])
        model.fit(x_train, y_train,
                  epochs=NUM_EPOCHS,
                  validation_data=(x_val, y_val),
                  batch_size=BATCH_SIZE,
                  callbacks=[EarlyStopping(patience=PATIENCE)],
                  verbose=False)
        print('embedding_dim = {}, hidden_layer_dim = {}'.format(emb_dim, hidden_dims))
        # FIX(review): select on the VALIDATION set — picking the winner by
        # test accuracy (as before) leaks the test set into model selection.
        val_metrics = model.evaluate(x_val, y_val)
        if val_metrics[1] > best_accuracy:
            best_accuracy = val_metrics[1]
            best_params = {'embedding_dim': emb_dim, 'hidden_layer_dim': hidden_dims}
best_params
The grid search for the optimal structure shows that a higher embedding dimension leads to a higher accuracy; however, the increase is marginal, so it's probably not worth increasing the embedding dimension. On the other hand, the number of nodes (30 vs. 15) in the hidden layer seems to have a larger effect on the accuracy. Having a deeper structure (2 layers vs. 1 layer) also seems to give a marginal increase in accuracy.
In this notebook, I successfully implemented the PitchNet for a 9-class pitch classification task. Although the PitchNet has a fairly simple structure, it is already able to achieve a 95% accuracy. The model does not seem to overfit given its simple structure. However, some dropout layers after the embedding layer or the hidden layers could also be added.